Load data

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.0 ──
## ✔ broom        1.0.5     ✔ rsample      1.1.1
## ✔ dials        1.2.0     ✔ tibble       3.2.1
## ✔ dplyr        1.1.2     ✔ tidyr        1.3.0
## ✔ infer        1.0.4     ✔ tune         1.1.1
## ✔ modeldata    1.1.0     ✔ workflows    1.1.3
## ✔ parsnip      1.1.0     ✔ workflowsets 1.0.1
## ✔ purrr        1.0.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.6
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Values of conditions to test
flow.rate <- 0  # 0 0.6 3 5.9 8.9
chlorophyll <- 0 # 0 4.3 4.6 5.5 6.1 7.6 13.5 19
guano <- 1 # Absent=1 Present=2
light <- 1 # Absent=1 Present=2

# Vectors of all possible conditions combinations
frs <- as.numeric(as.character(unique(CC.TotalData$Flow.rate)))
chls <- as.numeric(as.character(unique(CC.TotalData$Chlorophyll)))
guans <- c(1,2)
lights <- c(1,2)
conditions <- expand.grid(frs,chls,guans,lights)

Autocorrelation looking at linear regression and residuals. Note that residuals are not normally distributed.

velocity <- CC.TotalData$v[
  (CC.TotalData$Flow.rate==flow.rate & 
     CC.TotalData$Chlorophyll==chlorophyll & 
     as.numeric(CC.TotalData$Guano)==guano &
     as.numeric(CC.TotalData$Light)==light)]

x <- log10(velocity[1:length(velocity)-1])
y <- log10(velocity[2:length(velocity)])


vel.auto.lm <- lm(x ~ y + 1)
summary(vel.auto.lm)
## 
## Call:
## lm(formula = x ~ y + 1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1940 -0.2416  0.0330  0.2671  2.9707 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.742378   0.002666  -278.5   <2e-16 ***
## y            0.500275   0.001704   293.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.424 on 258075 degrees of freedom
## Multiple R-squared:  0.2503, Adjusted R-squared:  0.2503 
## F-statistic: 8.615e+04 on 1 and 258075 DF,  p-value: < 2.2e-16
plot(hexbin(x,
            y,
            xbins = 100),
     xlab='Log velocity (m/s)',
     ylab='Log velocity (m/s) next time step',
     legend = 0)

hist(log10(velocity),100)

hist(vel.auto.lm$residuals,100)

qqnorm(vel.auto.lm$residuals,main="Normal Q-Q plot")
qqline(vel.auto.lm$residuals)

ks.test(vel.auto.lm$residuals,'pnorm')
## Warning in ks.test.default(vel.auto.lm$residuals, "pnorm"): ties should not be
## present for the Kolmogorov-Smirnov test
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  vel.auto.lm$residuals
## D = 0.21517, p-value < 2.2e-16
## alternative hypothesis: two-sided

Loop through combinations

for (i in 1:dim(conditions)[1])
{
  velocity <- CC.TotalData$v[
  (CC.TotalData$Flow.rate==conditions[i,1] & 
     CC.TotalData$Chlorophyll==conditions[i,2] & 
     as.numeric(CC.TotalData$Guano)==conditions[i,3] &
     as.numeric(CC.TotalData$Light)==conditions[i,4])]
  if (length(velocity) <= 1) {
    conditions[i,5] <- NA } else {
      c<-cor.test(log10(velocity[1:length(velocity)-1]),
                  log10(velocity[2:length(velocity)]))
      conditions[i,5] <- c$estimate
        v0 <- log10(velocity[1:length(velocity)-1])
        v1 <- log10(velocity[2:length(velocity)])
        df <- data.frame(v0,v1)
        fit <- lm(v1 ~ v0 + 1, data = df)
        conditions[i,6] <- fit$coefficients[1]
        conditions[i,7] <- fit$coefficients[2]
        d <- dip.test(velocity)
        conditions[i,8] <- d$p.value
        hist(log10(velocity),100,
             main=d$p.value)
  }  
}
## n = 258078 > max_n{n in table} = 72000 -- using that as asymptotic value.

## n = 116154 > max_n{n in table} = 72000 -- using that as asymptotic value.

## n = 266817 > max_n{n in table} = 72000 -- using that as asymptotic value.

## n = 211577 > max_n{n in table} = 72000 -- using that as asymptotic value.

## n = 253471 > max_n{n in table} = 72000 -- using that as asymptotic value.

## n = 192727 > max_n{n in table} = 72000 -- using that as asymptotic value.

## n = 271850 > max_n{n in table} = 72000 -- using that as asymptotic value.

## n = 91500 > max_n{n in table} = 72000 -- using that as asymptotic value.

## n = 137251 > max_n{n in table} = 72000 -- using that as asymptotic value.

## n = 282203 > max_n{n in table} = 72000 -- using that as asymptotic value.

## n = 140189 > max_n{n in table} = 72000 -- using that as asymptotic value.

## n = 142013 > max_n{n in table} = 72000 -- using that as asymptotic value.

conditions <- setNames(conditions,c("flow.rate","chlorophyll","guano","light","corr.val","slope","intercept","dip.test"))

Using random forest to fit the missing values

Idata <- which(!is.na(conditions[,5])) # Index of data
Iblank <- which(is.na(conditions[,5])) # Index of blank data

conditions_data <- conditions[Idata,]
conditions_blank <- conditions[Iblank,]


# Tidy Models version
#data_split <- initial_split(conditions_data, prop = 0.6)
#data_test <- testing(data_split)
#data_train <- training(data_split)
#data_folds <- vfold_cv(data_train, 
#                       v = 5, # number of partitions in dataset 
#                       repeats = 1)
#mc_split <- mc_cv(data_train, 
#                  prop = 9/10, # proportion to use per resample
#                  times = 20) # number of resamples
#template_data <- training(data_split)


# Random forest package version - doing dip test
conditions.rf <- randomForest(dip.test ~ .,
                              data=select(conditions_data,c(1,2,3,4,8)),
                              importance=TRUE,
                              proximity=TRUE)
#print(conditions.rf)
#round(importance(conditions.rf), 2)
#sqrt(conditions.rf$mse[which.min(conditions.rf$mse)]) 
plot(conditions.rf)

varImpPlot(conditions.rf)

conditions_blank$dip.test <- predict(conditions.rf,conditions_blank[,1:4])
fit.dip.test <- predict(conditions.rf,conditions_data[,1:4])
plot(fit.dip.test,conditions_data$dip.test)

plot(conditions_blank$chlorophyll,conditions_blank$dip.test)

plot(conditions_blank$flow.rate,conditions_blank$dip.test)

plot(conditions_blank$light,conditions_blank$dip.test)

plot(conditions_blank$guano,conditions_blank$dip.test)

Random forest test for corr

conditions.rf <- randomForest(corr.val ~ .,
                              data=select(conditions_data,c(1,2,3,4,5)),
                              importance=TRUE,
                              proximity=TRUE)
#print(conditions.rf)
#round(importance(conditions.rf), 2)
#sqrt(conditions.rf$mse[which.min(conditions.rf$mse)]) 
plot(conditions.rf)

varImpPlot(conditions.rf)

conditions_blank$corr.val <- predict(conditions.rf,conditions_blank[,1:4])
fit.corr.val <- predict(conditions.rf,conditions_data[,1:4])
plot(fit.corr.val,conditions_data$corr.val)

plot(conditions_blank$chlorophyll,conditions_blank$corr.val)

plot(conditions_blank$flow.rate,conditions_blank$corr.val)

plot(conditions_blank$light,conditions_blank$corr.val)

plot(conditions_blank$guano,conditions_blank$corr.val)